Load Packages

install.packages(c("tidyverse", "dplyr","countrycode"), repos = "http://cran.rstudio.com/")
## 
## The downloaded binary packages are in
##  /var/folders/8b/ht4__40d6qdcxj1qqfwjdb9h0000gn/T//Rtmpkvvml0/downloaded_packages
library(dplyr)
library(tidyverse)
library(countrycode)

Reading dataset

# Load the raw cost-of-living table from the working directory.
df <- read.csv(file = "Cost_of_Living_Index_2022.csv", header = TRUE)

# The first data row holds the real column labels, so promote it to the header
# and then remove it from the data.
# NOTE(review): why header = TRUE does not pick these labels up is not visible
# here (possibly an extra line above them in the CSV) -- confirm against the file.
names(df) <- df[1, ]
df <- df[-1, ]
View(df)

Cleaning the dataset

# Inspect per-column summaries before any type conversion is applied.
summary(df)
##      Rank             Country          Cost of Living Index  Rent Index       
##  Length:139         Length:139         Length:139           Length:139        
##  Class :character   Class :character   Class :character     Class :character  
##  Mode  :character   Mode  :character   Mode  :character     Mode  :character  
##  Cost of Living Plus Rent Index Groceries Index    Restaurant Price Index
##  Length:139                     Length:139         Length:139            
##  Class :character               Class :character   Class :character      
##  Mode  :character               Mode  :character   Mode  :character      
##  Local Purchasing Power Index
##  Length:139                  
##  Class :character            
##  Mode  :character
# Inspect each column's type and its first few values.
str(df)
## 'data.frame':    139 obs. of  8 variables:
##  $ Rank                          : chr  "1" "2" "3" "4" ...
##  $ Country                       : chr  "Afghanistan" "Albania" "Algeria" "Argentina" ...
##  $ Cost of Living Index          : chr  "20.37" "35.5" "26.87" "34.69" ...
##  $ Rent Index                    : chr  "2.72" "8.47" "4.59" "7.71" ...
##  $ Cost of Living Plus Rent Index: chr  "12.09" "22.83" "16.43" "22.04" ...
##  $ Groceries Index               : chr  "14.92" "29.32" "28.82" "28.17" ...
##  $ Restaurant Price Index        : chr  "12.41" "25.82" "14.48" "33.32" ...
##  $ Local Purchasing Power Index  : chr  "23.04" "30.19" "24.63" "30.72" ...
# Check for missing values before converting types.
any(is.na(df))
## [1] FALSE
# Convert column 1 (Rank) and columns 3-8 (the index columns) to numeric,
# then drop the first column.
df <- df %>%
  mutate(across(.cols = c(1, 3:8), .fns = as.numeric))
df <- df[-1]
str(df)
## 'data.frame':    139 obs. of  7 variables:
##  $ Country                       : chr  "Afghanistan" "Albania" "Algeria" "Argentina" ...
##  $ Cost of Living Index          : num  20.4 35.5 26.9 34.7 33.9 ...
##  $ Rent Index                    : num  2.72 8.47 4.59 7.71 11.61 ...
##  $ Cost of Living Plus Rent Index: num  12.1 22.8 16.4 22 23.4 ...
##  $ Groceries Index               : num  14.9 29.3 28.8 28.2 27.6 ...
##  $ Restaurant Price Index        : num  12.4 25.8 14.5 33.3 30.6 ...
##  $ Local Purchasing Power Index  : num  23 30.2 24.6 30.7 28.9 ...

Data Exploration

1. Cost of living index - Let’s first visualize the distribution of the outcome variable

# Distribution of the outcome variable: density-scaled histogram with a kernel
# density overlay; the dashed red line marks the sample mean.
options(repr.plot.width = 14, repr.plot.height = 8)
ggplot(df, aes(x = `Cost of Living Index`)) +
  geom_histogram(
    # after_stat() replaces the deprecated `..density..` notation.
    aes(y = after_stat(density)),
    position = "identity",
    fill = "#E69F00",
    color = "#E69F00",
    alpha = 0.6
  ) +
  geom_density(fill = "#E69F00", alpha = 0.3, color = "#E69F00") +
  geom_vline(
    xintercept = mean(df$`Cost of Living Index`),
    linetype = "dashed",
    color = "red"
  ) +
  # NOTE(review): xlim() discards observations outside 0-100 before the
  # histogram and density are computed, while the mean line above uses every
  # row; confirm no country exceeds 100 or switch to coord_cartesian().
  xlim(0, 100) +
  labs(title = "Cost of Living Index Distribution", y = "Density") +
  theme(text = element_text(size = 14))

All Indices (Rent, Grocery, Purchase Power, Restaurant Price) - Let’s take a look at the distribution curve of all numerical variables.

# One density-scaled histogram (with density curve) per numeric column,
# laid out in two columns with independent axes per facet.
options(repr.plot.width = 14, repr.plot.height = 8)
df %>%
  keep(is.numeric) %>%
  # Stack all numeric columns into long `key` / `value` pairs for faceting.
  gather() %>%
  # No fill mapping: the histogram sets a constant fill and the density curve
  # cannot use a per-observation fill, so the old `fill = value` had no effect.
  ggplot(aes(x = value)) +
  geom_histogram(
    # after_stat() replaces the deprecated `..density..` notation.
    aes(y = after_stat(density)),
    position = "identity",
    color = "blue2",
    fill = "cornflowerblue",
    alpha = 0.9
  ) +
  geom_density(color = "blue") +
  facet_wrap(~ key, scales = "free", ncol = 2) +
  theme(text = element_text(size = 14))

Top 10 countries with the highest cost of living index

# Bar chart of the ten countries with the highest cost of living index,
# ordered from highest to lowest.
options(repr.plot.width = 14, repr.plot.height = 8)
df %>%
  # Sort on the piped data's own column rather than reaching back to `df$...`,
  # so the step stays correct if rows are filtered or reordered upstream.
  arrange(desc(`Cost of Living Index`)) %>%
  slice(1:10) %>%
  ggplot(
    aes(
      x = reorder(Country, -`Cost of Living Index`),
      y = `Cost of Living Index`
    )
  ) +
  # Fill is a fixed colour, so no fill scale is needed (the previous
  # scale_fill_brewer() had nothing mapped to it).
  geom_bar(stat = "identity", color = "skyblue", fill = "cornflowerblue") +
  labs(
    x = " Country",
    y = "Cost of Living Index",
    title = "Top 10 Countries with Highest Cost of Living Index "
  ) +
  theme(
    axis.text.x = element_text(angle = 60),
    text = element_text(size = 14),
    plot.title = element_text(size = 13)
  )

Top 10 countries with the highest rent index

# Bar chart of the ten countries with the highest rent index,
# ordered from highest to lowest.
options(repr.plot.width = 14, repr.plot.height = 8)
df %>%
  # Sort on the piped data's own column rather than reaching back to `df$...`,
  # so the step stays correct if rows are filtered or reordered upstream.
  arrange(desc(`Rent Index`)) %>%
  slice(1:10) %>%
  ggplot(aes(x = reorder(Country, -`Rent Index`), y = `Rent Index`)) +
  # Fill is a fixed colour, so no fill scale is needed (the previous
  # scale_fill_brewer() had nothing mapped to it).
  geom_bar(
    stat = "identity",
    color = "skyblue",
    fill = "royalblue",
    alpha = 0.6
  ) +
  labs(
    x = "Country",
    y = "Rent Index",
    title = "Top 10 Countries with Highest Rent Index "
  ) +
  theme(
    axis.text.x = element_text(angle = 60),
    text = element_text(size = 14),
    plot.title = element_text(size = 13)
  )

Linear Regression

Correlation Matrix

library(psych)
# Pairwise correlations between all numeric columns, followed by a
# per-column summary of those correlation values.
df_cor <- df %>%
  select_if(is.numeric) %>%
  cor()
summary(df_cor)
##  Cost of Living Index   Rent Index     Cost of Living Plus Rent Index
##  Min.   :0.6872       Min.   :0.6381   Min.   :0.6953                
##  1st Qu.:0.8622       1st Qu.:0.7994   1st Qu.:0.9227                
##  Median :0.9500       Median :0.8191   Median :0.9379                
##  Mean   :0.9000       Mean   :0.8357   Mean   :0.9106                
##  3rd Qu.:0.9724       3rd Qu.:0.9136   3rd Qu.:0.9657                
##  Max.   :1.0000       Max.   :1.0000   Max.   :1.0000                
##  Groceries Index  Restaurant Price Index Local Purchasing Power Index
##  Min.   :0.6265   Min.   :0.6908         Min.   :0.6265              
##  1st Qu.:0.8122   1st Qu.:0.8130         1st Qu.:0.6504              
##  Median :0.8942   Median :0.8847         Median :0.6890              
##  Mean   :0.8632   Mean   :0.8658         Mean   :0.7230              
##  3rd Qu.:0.9583   3rd Qu.:0.9303         3rd Qu.:0.6942              
##  Max.   :1.0000   Max.   :1.0000         Max.   :1.0000
# Draw the correlation matrix as a colour-coded grid with black labels.
options(repr.plot.width = 14, repr.plot.height = 8)
corrplot::corrplot(
  corr = df_cor,
  method = "color",
  tl.cex = 0.75,
  tl.col = "black"
)

- All variables are positively correlated with each other. - The cost of living index is highly positively correlated with the cost of living plus rent, groceries, and restaurant price indices.

Regression model

# Linear model of the cost of living index on every other numeric column.
# The response is named as a column (not `df$...`) so that it is looked up in
# `data`; this keeps the response and predictors tied to the same rows and lets
# predict() work with new data.
# NOTE(review): `Cost of Living Plus Rent Index` is one of the predictors and
# the reported fit has R-squared = 1, which suggests the response is (almost)
# an exact linear combination of it and `Rent Index` -- consider dropping it.
m <- lm(`Cost of Living Index` ~ ., data = select_if(df, is.numeric))
summary(m)
## 
## Call:
## lm(formula = df$`Cost of Living Index` ~ ., data = select_if(df, 
##     is.numeric))
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0138766 -0.0054135  0.0007739  0.0050219  0.0155829 
## 
## Coefficients:
##                                    Estimate Std. Error   t value Pr(>|t|)    
## (Intercept)                      -1.472e-03  2.086e-03    -0.706   0.4815    
## `Rent Index`                     -8.826e-01  2.004e-04 -4403.822   <2e-16 ***
## `Cost of Living Plus Rent Index`  1.883e+00  3.714e-04  5069.842   <2e-16 ***
## `Groceries Index`                -1.593e-04  1.298e-04    -1.227   0.2220    
## `Restaurant Price Index`         -1.603e-04  8.001e-05    -2.003   0.0472 *  
## `Local Purchasing Power Index`    2.756e-05  3.120e-05     0.883   0.3787    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.006926 on 133 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 2.504e+08 on 5 and 133 DF,  p-value: < 2.2e-16

Interpretation

Plot of p-values

# Extraction: one row per model term (row names are the term names) holding its
# p-value, its coefficient estimate, and the term name as a regular column.
coef_table <- summary(m)$coefficients
pvalue <- data.frame(
  `p-value` = coef_table[, "Pr(>|t|)"],
  coefficient = coef_table[, "Estimate"],
  variables = rownames(coef_table),
  check.names = FALSE,
  stringsAsFactors = FALSE
)
View(pvalue)

# Plot: p-value of each model term, ordered from smallest to largest, with a
# red reference line at the 0.05 significance level.
options(repr.plot.width = 14, repr.plot.height = 8)
ggplot(pvalue, aes(x = reorder(`variables`, `p-value`), y = `p-value`)) +
  geom_col(fill = "cornflowerblue") +
  labs(title = "P-value Factors on Cost of Living Index", x = " ") +
  theme(
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(hjust = 0.5, face = "bold")
  ) +
  geom_hline(yintercept = 0.05, col = "red") +
  # annotate() draws the label once; geom_text() with constant aesthetics drew
  # one copy per data row on top of each other.
  annotate("text", x = 2, y = 0.08, label = "0.05", size = 4.0, color = "red")

How does cost of living index vary across continents?

# Map each country name to countrycode's "region" classification and store it
# in the `Continent` column.
df$Continent <- countrycode(
  df$Country,
  origin = "country.name",
  destination = "region"
)

# Average cost of living index per region.
df_continent <- df %>%
  group_by(Continent) %>%
  summarise(Mean_Cost_of_Living_Index = mean(`Cost of Living Index`))

# Bar chart of the mean cost of living index per region, highest first.
options(repr.plot.width = 14, repr.plot.height = 8)
ggplot(
  df_continent,
  aes(
    x = reorder(`Continent`, -`Mean_Cost_of_Living_Index`),
    y = `Mean_Cost_of_Living_Index`
  )
) +
  # Fill is a fixed colour, so no fill scale is needed (the previous
  # scale_fill_brewer() had nothing mapped to it).
  geom_col(color = "Skyblue", fill = "cornflowerblue", alpha = 0.9) +
  theme(axis.text.x = element_text(angle = 90)) +
  # The stray empty argument (`,,`) that used to follow `y` has been removed.
  labs(
    y = "Mean Cost of Living Index",
    x = "Continental Region",
    title = "Mean Living Cost Index by Continental Region"
  )

# Polar bar chart of the mean cost of living index, one coloured wedge per
# region; the regions are identified through the legend only.
ggplot(
  df_continent,
  aes(
    x = `Continent`,
    y = `Mean_Cost_of_Living_Index`,
    fill = `Continent`
  )
) +
  geom_bar(stat = "identity", width = 1, color = "white", alpha = .6) +
  coord_polar("x", start = 20) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Mean Cost of Living Index by Continental Region",
    x = " ",
    y = " "
  ) +
  theme(
    plot.title = element_text(size = 12, hjust = 1, face = "bold"),
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 12),
    legend.position = "right",
    legend.title = element_blank(),
    legend.text = element_text(size = 12)
  )

Geospatial Visualizations

Let’s visualize the cost of living index across countries and continents

Geospatial graph packages and libraries
install.packages(c("rworldmap", "ggplot2", "sf", "scales", "rnaturalearth", "rnaturalearthdata", "plotly"),repos = "http://cran.rstudio.com/")
## 
## The downloaded binary packages are in
##  /var/folders/8b/ht4__40d6qdcxj1qqfwjdb9h0000gn/T//Rtmpkvvml0/downloaded_packages
library(ggplot2)
library(sf)
library(rworldmap)
library(plotly)

Cost of Living Index

options(repr.plot.width = 14, repr.plot.height = 8)

# Join the table to rworldmap's country map, matching on the country name.
index.map <- joinCountryData2Map(
  dF = df,
  joinCode = "NAME",
  nameJoinColumn = "Country"
)
## 137 codes from your data successfully matched countries in the map
## 2 codes from your data failed to match with a country code in the map
## 106 codes from the map weren't represented in your data
# Remove the plot margins except for one line at the top, then draw the
# world map coloured by the cost of living index.
par(mar = c(0, 0, 1, 0))
mapCountryData(
  mapToPlot = index.map,
  nameColumnToPlot = "Cost of Living Index",
  colourPalette = "diverging"
)

**Interactive Map: Cost of Living Index**

# Two-column lookup table keyed on `region`, the column it is joined on below.
df1 <- df %>%
  select(region = Country, Cost_of_Living_Index = `Cost of Living Index`)
mapdata <- map_data("world")
# Keep every map polygon (left join). The previous right join kept only
# countries present in `df1`, so countries without data vanished from the map
# and the `na.value` fill configured on the plot's colour scale never applied.
# NOTE(review): the join is on exact name equality; countries spelled
# differently in the two sources will show as missing -- verify and recode.
mapdata <- left_join(mapdata, df1, by = "region")

options(repr.plot.width = 14, repr.plot.height = 8)

# Choropleth: one polygon per map group, filled on a yellow-to-red gradient by
# cost of living index, with all axis decoration and background removed.
map1 <- ggplot(mapdata, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = Cost_of_Living_Index), color = "black") +
  scale_fill_gradient(
    name = "Cost of Living Index",
    low = "yellow",
    high = "red",
    na.value = "gray50"
  ) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    rect = element_blank()
  )

# Convert the static map into an interactive plotly widget.
# The size is passed to ggplotly() directly: assigning to `map2$layout$...`
# writes to a top-level element of the widget rather than to its plotly layout.
map2 <- ggplotly(map1, width = 800, height = 600) %>%
  highlight(
    "plotly_hover",
    selected = attrs_selected(line = list(color = "black"))
  )
map2